import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import pandas as pd
import numpy as np


def oems_level1():
    OEMs = pd.read_csv(PATHS.marklines / 'OEMs.csv')
    OEMs['Level1_MLName'] = OEMs['Level1_MLName'].fillna(OEMs['Level1_OrbisName'])
    OEMs = OEMs[['OEM_Level1_ID', 'Level1_MLName', 'Level2_bvdid']].drop_duplicates()
    OEMs_fct = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/OEMs_factset.csv')
    OEMs_fct = OEMs_fct[['bvdid', 'fctid', 'fctname', 'fct_home_region']].drop_duplicates()
    OEMs = OEMs.merge(OEMs_fct, left_on='Level2_bvdid', right_on='bvdid', how='left')
    OEMs = OEMs.drop(columns='bvdid')
    OEMs = OEMs[OEMs['fctid'].notnull()]
    OEMs['fctid'] = OEMs['fctid'].astype(int)
    return OEMs


def get_supplychain_data_of_oems(OEMs):
    # IMPORT supply-chain DATA
    dataSC = pd.read_csv(PATHS.factset / 'SUPPLY_CHAIN.csv')
    dataSC = dataSC[['START$', 'END$', 'SUPPLIER_ID', 'CUSTOMER_ID']].drop_duplicates()
    # KEEP supply-chain DATA OF AUTO PRODUCERS
    dataSC = OEMs.merge(dataSC, left_on='fctid', right_on='CUSTOMER_ID', how='inner')
    dataSC = dataSC.drop_duplicates()
    dataSC['SUPPLIER_ID'] = dataSC['SUPPLIER_ID'].astype(int)
    dataSC['CUSTOMER_ID'] = dataSC['CUSTOMER_ID'].astype(int)
    return dataSC


def yearvar(row, year):
    if (row['Year_start'] <= year) & (year <= row['Year_end']):
        return True
    else:
        return False


def check_if_need_to_add_fctset_ids_fromaliases(dataSC):
    df_Factset_alias = pd.read_csv(PATHS.factset / 'COMPANY_ALIAS.csv', sep=',')
    list_ids = dataSC[dataSC['SUPPLIER_ID'].notnull()]['SUPPLIER_ID'].tolist()
    df_Factset_alias = df_Factset_alias[df_Factset_alias['COMPANY_ID'].isin(list_ids)]
    df_Factset_alias = df_Factset_alias[~df_Factset_alias['ALIAS_ID'].isin(list_ids)]
    nbrrowstoadd = dataSC[dataSC['SUPPLIER_ID'].isin(df_Factset_alias['ALIAS_ID'].tolist())].shape[0]
    LOGGER.info('Nbr of rows to add from Aliases: {}'.format(nbrrowstoadd))
    return dataSC


def check_if_need_to_add_fctset_ids_fromredirect(dataSC):
    list_ids = dataSC[dataSC['SUPPLIER_ID'].notnull()]['SUPPLIER_ID'].tolist()
    # Factset ids can change over time. make sure, all relevant ids are included
    df_Factset_redirect = pd.read_csv(PATHS.factset / 'COMPANY_REDIRECT.csv', sep=',')
    df_Factset_redirect = df_Factset_redirect[df_Factset_redirect['REASON'].isin(['DUPLICATE', 'ERROR'])]
    df_Factset_redirect['REF_COMPANY_ID'] = df_Factset_redirect['REF_COMPANY_ID'].replace('None', np.nan).astype(float)
    df_Factset_redirect['COMPANY_ID'] = df_Factset_redirect['COMPANY_ID'].astype(float)
    df_Factset_redirect = df_Factset_redirect[df_Factset_redirect['REF_COMPANY_ID'].notnull()]
    df_Factset_redirect = df_Factset_redirect[df_Factset_redirect['COMPANY_ID'].notnull()]
    df_Factset_redirect = df_Factset_redirect[df_Factset_redirect['COMPANY_ID'].isin(list_ids)]
    nbrrowstoadd = dataSC[dataSC['SUPPLIER_ID'].isin(df_Factset_redirect['REF_COMPANY_ID'].tolist())].shape[0]
    LOGGER.info('Nbr of supplier factset ids to connect from Redirect: {}'.format(nbrrowstoadd))
    new_id_dict = dict(zip(df_Factset_redirect['REF_COMPANY_ID'], df_Factset_redirect['COMPANY_ID']))
    # Replace factset ids of suppliers in dataSC that are duplicates
    dataSC['SUPPLIER_ID'] = dataSC['SUPPLIER_ID'].apply(lambda x: new_id_dict[x] if x in new_id_dict.keys() else x)
    return dataSC


def create_year_variables(dataSC):
    # CONSTRUCT YEAR VARIABLES
    dataSC['START$'] = pd.to_datetime(dataSC['START$'])
    dataSC['END$'] = dataSC['END$'].replace('4000-01-01 00:00:00', '2018-01-01 00:00:00')
    dataSC['END$'] = pd.to_datetime(dataSC['END$'])
    # the last year in the data is jan 1st 2018, so it's really 2017
    dataSC['Year_start'] = pd.DatetimeIndex(dataSC['START$']).year
    dataSC['Year_end'] = pd.DatetimeIndex(dataSC['END$']).year
    first_year = dataSC['START$'].describe()['first'].year
    last_year = dataSC['END$'].describe()['last'].year
    LOGGER.info('First year: {}. Last year: {}'.format(first_year, last_year))
    # column 2004 is True if relationship existed in 2004 (if started in dec 2004, it still qualifies...)
    for year in range(first_year, last_year):
        dataSC[str(year)] = dataSC.apply(lambda row: yearvar(row, year), axis=1)
    return dataSC


def flag_suppliers_with_relevant_NAICS(dataSC):
    NAICStokeep = ['33', '32', '31', '54', '42', '44']
    suppliers_naics = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_naics.csv')
    suppliers_naics['NAICS_2digit'] = suppliers_naics['NAICS, Primary code(s)'].astype(str).str[:2]
    suppliers_naics = suppliers_naics[suppliers_naics['NAICS_2digit'].isin(NAICStokeep)]
    suppliers_ids = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_ids.csv')
    suppliers_naics = suppliers_naics.merge(suppliers_ids[['fctid', 'bvdid']], left_on='BvD ID number', right_on='bvdid', how='left')
    list_fctid_suppliersinNAICS = suppliers_naics['fctid'].tolist()
    dataSC['Sup_NAICS'] = dataSC['SUPPLIER_ID'].apply(lambda x: True if x in list_fctid_suppliersinNAICS else False)
    return dataSC


def aggregate_at_OEM_Level1_and_supplier_level(dataSC):
    # AGGREGATE AT OEM_Level1_ID and SUPPLIER_ID (ie agg the different prod id to the ipn id)
    list_columns = [k for k in dataSC.columns if k.isdigit()]
    dataSC_agg = pd.DataFrame(dataSC.groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])[list_columns[0]].agg(lambda x: True if True in x.unique() else False))
    dataSC_agg = dataSC_agg.rename(columns={'2003': 'Active2003'})
    for year in list_columns[1:]:
        dataSC_agg['Active' + str(year)] = dataSC.groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])[str(year)].agg(lambda x: True if True in x.unique() else False)
    dataSC_agg = dataSC_agg.reset_index()
    dataSC_agg['SUPPLIER_ID'] = dataSC_agg['SUPPLIER_ID'].astype(int)
    dataSC_agg = dataSC_agg.merge(dataSC[['SUPPLIER_ID', 'Sup_NAICS']].drop_duplicates(), on='SUPPLIER_ID', how='left')
    return dataSC_agg


def add_oems_name_and_region(dataSC):
    # ADD oem names and regions
    oem_name = dataSC.groupby('OEM_Level1_ID')['Level1_MLName'].agg(lambda x: list(x.unique())[0])
    oem_region = dataSC[dataSC['fct_home_region'] != 'None'].groupby('OEM_Level1_ID')['fct_home_region'].agg(lambda x: ','.join([str(i) for i in list(x.unique())])).rename('oem_fctregion')
    oem_orbisregion = dataSC.groupby('OEM_Level1_ID')['Level2_bvdid'].agg(lambda x: ','.join(list(set([str(i)[:2] for i in x.unique()])))).rename('oem_orbisregion')
    oem_info = pd.concat([oem_name, oem_region, oem_orbisregion], axis=1).reset_index()
    return oem_info


def add_suppliers_name_and_region():
    # Add supplier names and regions
    df_suppliers = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_ids.csv')
    supplier_name = df_suppliers.groupby('fctid')['fctname'].agg(lambda x: list(x.unique())[0]).rename('supplier_fctname')
    supplier_region = df_suppliers.groupby('fctid')['fct_home_region'].agg(lambda x: ','.join([str(i) for i in list(x.unique())])).rename('supplier_fctregion')
    supplier_orbisregion = df_suppliers.groupby('fctid')['orbis_region'].agg(lambda x: ','.join(list(set([str(i) for i in x.unique()])))).rename('supplier_orbisregion')
    suppliers_info = pd.concat([supplier_name, supplier_region, supplier_orbisregion], axis=1).reset_index().rename(columns={'fctid': 'SUPPLIER_ID'})
    return suppliers_info

def add_year_Supp_appears_in_FctSet(dataSC):
    company_basics = pd.read_csv(PATHS.factset / 'COMPANY_withISIN.csv', sep=',', usecols=['NAME', 'ID', 'START$'])
    company_basics['START$'] = pd.to_datetime(company_basics['START$'])
    company_basics['year_company_first_appears'] = pd.DatetimeIndex(company_basics['START$']).year ## not sure why, but some companies have multiple start dates
    company_basics = company_basics.groupby(['ID'])['year_company_first_appears'].apply(lambda x: min(x)).reset_index()   
    supplierMask = company_basics['ID'].isin(dataSC['SUPPLIER_ID'].values)
    company_basics = company_basics[supplierMask].rename(columns = {'ID':'SUPPLIER_ID'}) 
    return company_basics[['SUPPLIER_ID','year_company_first_appears']]

def add_variable_number_of_links_suppliers_have_in_year_t(prod_sup_panel):
    # CREATE VARIABLE FOR THE NBR LINKS SUPPLIERS HAVE
    # in year t, captures the number of producers that supplier j works with
    mask = prod_sup_panel['Active']
    supp_links_panel = prod_sup_panel[mask].groupby(['SUPPLIER_ID', 'Year'])['OEM_Level1_ID'].nunique().rename('SuppLinks').reset_index()
    # add VARIABLE NBR OF LINKS OF SUPPLIERS IN THAT YEAR TO prod_sup_panel
    prod_sup_panel = prod_sup_panel.merge(supp_links_panel, on=['SUPPLIER_ID', 'Year'], how='left')
    prod_sup_panel = prod_sup_panel.fillna(0)
    prod_sup_panel['SuppLinks'] = prod_sup_panel['SuppLinks'].astype(int)
    return prod_sup_panel


def create_panel_oem_supplier_year(dataSC_agg):
    # CREATE PANEL PROD SUPP YEAR
    prod_sup_panel = pd.wide_to_long(dataSC_agg, stubnames='Active', i=['OEM_Level1_ID', 'SUPPLIER_ID'], j='Year').reset_index()
    # CREATE DURATION VARIABLE, i.e. total number of numbers when  the relationship is active in factset
    # year = 2015
    df = []
    for year in range(2003, 2018):
        mask = prod_sup_panel['Year'] <= year
        df1 = prod_sup_panel[mask].groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])['Active'].sum().rename('CumulativeAgeOfLink').reset_index()
        df1['Year'] = year
        df.append(df1)
    df = pd.concat(df)
    prod_sup_panel = prod_sup_panel.merge(df, on=['OEM_Level1_ID', 'SUPPLIER_ID', 'Year'], how='left')
    # create a variable that gives the number of years since th relationship was first active in the data
    year_first_active = prod_sup_panel[prod_sup_panel['Active']].groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])['Year'].min().rename('Years_since_firstactive')
    prod_sup_panel = prod_sup_panel.merge(year_first_active, on=['OEM_Level1_ID', 'SUPPLIER_ID'], how='left')
    prod_sup_panel['Years_since_firstactive'] = prod_sup_panel['Year'] - prod_sup_panel['Years_since_firstactive']
    prod_sup_panel = add_variable_number_of_links_suppliers_have_in_year_t(prod_sup_panel)
    cols = ['OEM_Level1_ID', 'Level1_MLName', 'oem_orbisregion', 'oem_fctregion', 'SUPPLIER_ID', 'supplier_fctname', 'supplier_fctregion', 'supplier_orbisregion', 'Sup_NAICS', 'Year']
    cols = cols + [k for k in prod_sup_panel.columns if k not in cols]
    prod_sup_panel = prod_sup_panel[cols]
    prod_sup_panel['SuppLinks_X_Age'] = prod_sup_panel['SuppLinks'] * prod_sup_panel['CumulativeAgeOfLink']
    return prod_sup_panel


def calculatespanbreak(row):
    SpanBreak = 0
    for y in range(row['Year_start'] + 1, row['Year_end']):
        if row['Active{}'.format(y)] is False:
            SpanBreak += 1
    return SpanBreak


def create_crosssection_oem_supplier(dataSC, dataSC_agg, prod_sup_panel):
    # CREATE CROSSSECTION PROD SUPP
    prod_sup_cross = pd.DataFrame(dataSC.groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])['Year_start'].min())
    prod_sup_cross['Year_end'] = dataSC.groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])['Year_end'].max()
    prod_sup_cross['START$'] = dataSC.groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])['START$'].min()
    prod_sup_cross['END$'] = dataSC.groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])['END$'].max()
    prod_sup_cross['Span'] = prod_sup_cross['END$'] - prod_sup_cross['START$']
    prod_sup_cross['SpanMonths'] = prod_sup_cross['Span'] / np.timedelta64(1, 'M')
    prod_sup_cross['SpanYears'] = prod_sup_cross['Span'] / np.timedelta64(1, 'Y')
    # if the relationship was interrupted in the middle, calculate span of interruptions
    dataSC_agg = dataSC_agg.merge(prod_sup_cross.reset_index(), on=['OEM_Level1_ID', 'SUPPLIER_ID'], how='outer')
    dataSC_agg = dataSC_agg.set_index(['OEM_Level1_ID', 'SUPPLIER_ID'])
    dataSC_agg['Year_start'] = dataSC_agg['Year_start'].astype(int)
    dataSC_agg['Year_end'] = dataSC_agg['Year_end'].astype(int)
    dataSC_agg['SpanBreak'] = dataSC_agg.apply(lambda row: calculatespanbreak(row), axis=1)
    dataSC_agg = dataSC_agg.reset_index()
    prod_sup_cross = prod_sup_cross.merge(dataSC_agg[['OEM_Level1_ID', 'SUPPLIER_ID', 'SpanBreak']], on=['OEM_Level1_ID', 'SUPPLIER_ID'], how='outer')
    prod_sup_cross['EffSpanYears'] = prod_sup_cross['SpanYears'] - prod_sup_cross['SpanBreak']
    prod_sup_cross = prod_sup_cross.merge(dataSC[['SUPPLIER_ID', 'Sup_NAICS']].drop_duplicates(), on='SUPPLIER_ID', how='left')
    df = dataSC_agg[['OEM_Level1_ID', 'Level1_MLName', 'oem_orbisregion', 'oem_fctregion', 'SUPPLIER_ID', 'supplier_fctname', 'supplier_fctregion', 'supplier_orbisregion']].drop_duplicates()
    prod_sup_cross = prod_sup_cross.merge(df, on=['OEM_Level1_ID', 'SUPPLIER_ID'], how='left')
    cols = ['OEM_Level1_ID', 'Level1_MLName', 'oem_orbisregion', 'oem_fctregion', 'SUPPLIER_ID', 'supplier_fctname', 'supplier_fctregion', 'supplier_orbisregion', 'Sup_NAICS']
    cols = cols + [k for k in prod_sup_cross.columns if k not in cols]
    prod_sup_cross = prod_sup_cross[cols]
    prod_sup_cross = prod_sup_cross.set_index(['OEM_Level1_ID', 'SUPPLIER_ID'])
    maskActive = prod_sup_panel['Active'] == True
    prod_sup_cross['SuppLinks_yearlymean'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])['SuppLinks'].mean()
    prod_sup_cross['SuppLinks_yearlymin'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])['SuppLinks'].min()
    prod_sup_cross['SuppLinks_yearlymax'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'SUPPLIER_ID'])['SuppLinks'].max()
    prod_sup_cross['SuppLinks_yearlymean_X_EffSpanYears'] = prod_sup_cross['SuppLinks_yearlymean'] * prod_sup_cross['EffSpanYears']
    maskNAICScr = prod_sup_cross['Sup_NAICS'] == True
    Sum_EffSpanYears_col = prod_sup_cross[maskNAICScr].groupby('OEM_Level1_ID')['EffSpanYears'].sum().rename('Sum_EffSpanYears')
    prod_sup_cross = prod_sup_cross.reset_index().merge(Sum_EffSpanYears_col, on='OEM_Level1_ID').set_index(['OEM_Level1_ID', 'SUPPLIER_ID'])
    prod_sup_cross['SuppLinks_mean_weighted_EffSpanYears'] = prod_sup_cross['SuppLinks_yearlymean_X_EffSpanYears'] / prod_sup_cross['Sum_EffSpanYears']
    prod_sup_cross = prod_sup_cross.reset_index()
    return prod_sup_cross


def create_oem_panel(prod_sup_panel):
    # CREATE PANEL PROD YEAR
    maskActive = prod_sup_panel['Active'] == True
    maskNAICS = prod_sup_panel['Sup_NAICS'] == True
    maskShared = prod_sup_panel['SuppLinks'] > 1
    prod_panel = pd.DataFrame(prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()).rename(columns={'SUPPLIER_ID': 'NbrSupp_all'})
    prod_panel['NbrSupp'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    prod_panel['NbrSuppShared'] = prod_sup_panel[maskActive & maskNAICS & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    prod_panel['NbrSuppShared_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    maskShared = prod_sup_panel['SuppLinks'] == 2
    prod_panel['NbrSuppShared_by2'] = prod_sup_panel[maskActive & maskNAICS & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    prod_panel['NbrSuppShared_by2_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    maskShared = (prod_sup_panel['SuppLinks'] >= 2) & (prod_sup_panel['SuppLinks'] <= 5)
    prod_panel['NbrSuppShared_by2to5'] = prod_sup_panel[maskActive & maskNAICS & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    prod_panel['NbrSuppShared_by2to5_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    maskShared = (prod_sup_panel['SuppLinks'] >= 6) & (prod_sup_panel['SuppLinks'] <= 9)
    prod_panel['NbrSuppShared_by6to9'] = prod_sup_panel[maskActive & maskNAICS & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    prod_panel['NbrSuppShared_by6to9_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    maskShared = prod_sup_panel['SuppLinks'] >= 10
    prod_panel['NbrSuppShared_by10+'] = prod_sup_panel[maskActive & maskNAICS & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    prod_panel['NbrSuppShared_by10+_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID', 'Year'])['SUPPLIER_ID'].nunique()
    prod_panel['NbrSuppLinks_mean'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].mean()
    prod_panel['NbrSuppLinks_std'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].std()
    prod_panel['NbrSuppLinks_25th'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].quantile(0.25)
    prod_panel['NbrSuppLinks_median'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].median()
    prod_panel['NbrSuppLinks_75th'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].quantile(0.75)
    prod_panel['NbrSuppLinks_max'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].max()
    prod_panel['NbrSuppLinks_sum'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].sum()
    prod_panel['NbrSuppLinks_mean_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].mean()
    prod_panel['NbrSuppLinks_std_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].std()
    prod_panel['NbrSuppLinks_25th_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].quantile(0.25)
    prod_panel['NbrSuppLinks_median_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].median()
    prod_panel['NbrSuppLinks_75th_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].quantile(0.75)
    prod_panel['NbrSuppLinks_max_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].max()
    prod_panel['NbrSuppLinks_sum_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks'].sum()
    prod_panel = prod_panel.fillna(0).astype(int)
    prod_panel['PercentShared'] = 100 * prod_panel['NbrSuppShared'] / prod_panel['NbrSupp']
    prod_panel['PercentShared_all'] = 100 * prod_panel['NbrSuppShared_all'] / prod_panel['NbrSupp_all']
    prod_panel['PercentShared_by2to5'] = 100 * prod_panel['NbrSuppShared_by2to5'] / prod_panel['NbrSupp']
    prod_panel['PercentShared_by2to5_all'] = 100 * prod_panel['NbrSuppShared_by2to5_all'] / prod_panel['NbrSupp_all']
    prod_panel['PercentShared_by6to9'] = 100 * prod_panel['NbrSuppShared_by6to9'] / prod_panel['NbrSupp']
    prod_panel['PercentShared_by6to9_all'] = 100 * prod_panel['NbrSuppShared_by6to9_all'] / prod_panel['NbrSupp_all']
    prod_panel['PercentShared_by10+'] = 100 * prod_panel['NbrSuppShared_by10+'] / prod_panel['NbrSupp']
    prod_panel['PercentShared_by10+_all'] = 100 * prod_panel['NbrSuppShared_by10+_all'] / prod_panel['NbrSupp_all']
    prod_panel = prod_panel.reset_index()
    df = prod_sup_panel[['OEM_Level1_ID', 'Level1_MLName', 'oem_orbisregion', 'oem_fctregion']].drop_duplicates()
    prod_panel = prod_panel.merge(df, on=['OEM_Level1_ID'], how='left')
    cols = ['OEM_Level1_ID', 'Level1_MLName', 'oem_orbisregion', 'oem_fctregion', 'Year']
    cols = cols + [k for k in prod_panel.columns if k not in cols]
    prod_panel = prod_panel[cols]
    prod_panel = prod_panel.set_index(['OEM_Level1_ID', 'Year'])
    maskNAICS = prod_sup_panel['Sup_NAICS'] == True
    maskActive = prod_sup_panel['Active'] == True
    prod_panel['CumulativeAgeOfLink_mean'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['CumulativeAgeOfLink'].mean()
    prod_panel['SuppLinks_X_Age_mean'] = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['SuppLinks_X_Age'].mean()
    Sum_agelink_ofsuppliersinyear = prod_sup_panel[maskActive & maskNAICS].groupby(['OEM_Level1_ID', 'Year'])['CumulativeAgeOfLink'].sum().rename('Sum_AgeLink')
    prod_panel = prod_panel.reset_index().merge(Sum_agelink_ofsuppliersinyear, on=['OEM_Level1_ID', 'Year']).set_index(['OEM_Level1_ID', 'Year'])
    prod_panel['NbrSuppLinks_mean_weighted_by_age'] = prod_panel['SuppLinks_X_Age_mean'] / prod_panel['Sum_AgeLink']
    prod_panel = prod_panel.reset_index()
    return prod_panel



def create_oem_cross(prod_sup_cross, prod_sup_panel, prod_panel):
    # CREATE CROSS PROD
    maskNAICScr = prod_sup_cross['Sup_NAICS'] == True
    maskNAICSpa = prod_sup_panel['Sup_NAICS'] == True
    prod_cross = pd.DataFrame(prod_sup_cross.groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()).rename(columns={'SUPPLIER_ID': 'NbrSupp_all'})
    prod_cross['NbrSupp'] = prod_sup_cross[maskNAICScr].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    maskActive = prod_sup_panel['Active'] == True
    maskShared = prod_sup_panel['SuppLinks'] > 1
    prod_cross['NbrSuppShared'] = prod_sup_panel[maskNAICSpa & maskActive & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    prod_cross['NbrSuppShared_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    prod_cross['NbrSuppLinks_sum'] = prod_sup_panel[maskNAICSpa & maskActive].groupby(['OEM_Level1_ID'])['SuppLinks'].sum()
    prod_cross['NbrSuppLinks_sum_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID'])['SuppLinks'].sum()
    maskShared = prod_sup_panel['SuppLinks'] == 2
    prod_cross['NbrSuppShared_by2'] = prod_sup_panel[maskActive & maskNAICSpa & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    prod_cross['NbrSuppShared_by2_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    maskShared = (prod_sup_panel['SuppLinks'] >= 2) & (prod_sup_panel['SuppLinks'] <= 5)
    prod_cross['NbrSuppShared_by2to5'] = prod_sup_panel[maskActive & maskNAICSpa & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    prod_cross['NbrSuppShared_by2to5_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    maskShared = (prod_sup_panel['SuppLinks'] >= 6) & (prod_sup_panel['SuppLinks'] <= 9)
    prod_cross['NbrSuppShared_by6to9'] = prod_sup_panel[maskActive & maskNAICSpa & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    prod_cross['NbrSuppShared_by6to9_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    maskShared = prod_sup_panel['SuppLinks'] >= 10
    prod_cross['NbrSuppShared_by10+'] = prod_sup_panel[maskActive & maskNAICSpa & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    prod_cross['NbrSuppShared_by10+_all'] = prod_sup_panel[maskActive & maskShared].groupby(['OEM_Level1_ID'])['SUPPLIER_ID'].nunique()
    prod_cross['NbrSuppLinks_mean'] = prod_sup_panel[maskActive & maskNAICSpa].groupby(['OEM_Level1_ID'])['SuppLinks'].mean()
    prod_cross['NbrSuppLinks_std'] = prod_sup_panel[maskActive & maskNAICSpa].groupby(['OEM_Level1_ID'])['SuppLinks'].std()
    prod_cross['NbrSuppLinks_25th'] = prod_sup_panel[maskActive & maskNAICSpa].groupby(['OEM_Level1_ID'])['SuppLinks'].quantile(0.25)
    prod_cross['NbrSuppLinks_median'] = prod_sup_panel[maskActive & maskNAICSpa].groupby(['OEM_Level1_ID'])['SuppLinks'].median()
    prod_cross['NbrSuppLinks_75th'] = prod_sup_panel[maskActive & maskNAICSpa].groupby(['OEM_Level1_ID'])['SuppLinks'].quantile(0.75)
    prod_cross['NbrSuppLinks_max'] = prod_sup_panel[maskActive & maskNAICSpa].groupby(['OEM_Level1_ID'])['SuppLinks'].max()
    prod_cross['NbrSuppLinks_mean_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID'])['SuppLinks'].mean()
    prod_cross['NbrSuppLinks_std_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID'])['SuppLinks'].std()
    prod_cross['NbrSuppLinks_25th_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID'])['SuppLinks'].quantile(0.25)
    prod_cross['NbrSuppLinks_median_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID'])['SuppLinks'].median()
    prod_cross['NbrSuppLinks_75th_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID'])['SuppLinks'].quantile(0.75)
    prod_cross['NbrSuppLinks_max_all'] = prod_sup_panel[maskActive].groupby(['OEM_Level1_ID'])['SuppLinks'].max()
    prod_cross['EffSpanYears_mean'] = prod_sup_cross[maskNAICScr].groupby(['OEM_Level1_ID'])['EffSpanYears'].mean()
    prod_cross['EffSpanYears_std'] = prod_sup_cross[maskNAICScr].groupby(['OEM_Level1_ID'])['EffSpanYears'].std()
    prod_cross['EffSpanYears_25th'] = prod_sup_cross[maskNAICScr].groupby(['OEM_Level1_ID'])['EffSpanYears'].quantile(.25)
    prod_cross['EffSpanYears_median'] = prod_sup_cross[maskNAICScr].groupby(['OEM_Level1_ID'])['EffSpanYears'].median()
    prod_cross['EffSpanYears_75th'] = prod_sup_cross[maskNAICScr].groupby(['OEM_Level1_ID'])['EffSpanYears'].quantile(.75)
    prod_cross['EffSpanYears_max'] = prod_sup_cross[maskNAICScr].groupby(['OEM_Level1_ID'])['EffSpanYears'].max()
    prod_cross['EffSpanYears_mean_all'] = prod_sup_cross.groupby(['OEM_Level1_ID'])['EffSpanYears'].mean()
    prod_cross['EffSpanYears_std_all'] = prod_sup_cross.groupby(['OEM_Level1_ID'])['EffSpanYears'].std()
    prod_cross['EffSpanYears_25th_all'] = prod_sup_cross.groupby(['OEM_Level1_ID'])['EffSpanYears'].quantile(.25)
    prod_cross['EffSpanYears_median_all'] = prod_sup_cross.groupby(['OEM_Level1_ID'])['EffSpanYears'].median()
    prod_cross['EffSpanYears_75th_all'] = prod_sup_cross.groupby(['OEM_Level1_ID'])['EffSpanYears'].quantile(.75)
    prod_cross['EffSpanYears_max_all'] = prod_sup_cross.groupby(['OEM_Level1_ID'])['EffSpanYears'].max()
    prod_cross = prod_cross.fillna(0).astype(int)
    prod_cross['PercentShared'] = 100 * prod_cross['NbrSuppShared'] / prod_cross['NbrSupp']
    prod_cross['PercentShared_all'] = 100 * prod_cross['NbrSuppShared_all'] / prod_cross['NbrSupp_all']
    prod_cross['PercentShared_by2to5'] = 100 * prod_cross['NbrSuppShared_by2to5'] / prod_cross['NbrSupp']
    prod_cross['PercentShared_by2to5_all'] = 100 * prod_cross['NbrSuppShared_by2to5_all'] / prod_cross['NbrSupp_all']
    prod_cross['PercentShared_by6to9'] = 100 * prod_cross['NbrSuppShared_by6to9'] / prod_cross['NbrSupp']
    prod_cross['PercentShared_by6to9_all'] = 100 * prod_cross['NbrSuppShared_by6to9_all'] / prod_cross['NbrSupp_all']
    prod_cross['PercentShared_by10+'] = 100 * prod_cross['NbrSuppShared_by10+'] / prod_cross['NbrSupp']
    prod_cross['PercentShared_by10+_all'] = 100 * prod_cross['NbrSuppShared_by10+_all'] / prod_cross['NbrSupp_all']
    prod_cross = prod_cross.reset_index().sort_values('OEM_Level1_ID')
    df = prod_sup_panel[['OEM_Level1_ID', 'Level1_MLName', 'oem_orbisregion', 'oem_fctregion']].drop_duplicates()
    prod_cross = prod_cross.merge(df, on=['OEM_Level1_ID'], how='left')
    cols = ['OEM_Level1_ID', 'Level1_MLName', 'oem_orbisregion', 'oem_fctregion']
    cols = cols + [k for k in prod_cross.columns if k not in cols]
    prod_cross = prod_cross[cols]
    prod_cross = prod_cross.set_index('OEM_Level1_ID')
    maskNAICSpa = prod_sup_panel['Sup_NAICS'] == True
    maskActive = prod_sup_panel['Active'] == True
    prod_cross['SuppLinks_X_Age_mean'] = prod_sup_panel[maskActive & maskNAICSpa].groupby(['OEM_Level1_ID'])['SuppLinks_X_Age'].mean()
    Sum_agelink_suppliersallyears = prod_sup_panel[maskActive & maskNAICSpa].groupby(['OEM_Level1_ID'])['CumulativeAgeOfLink'].sum().rename('Sum_AgeLink')
    prod_cross = prod_cross.reset_index().merge(Sum_agelink_suppliersallyears, on=['OEM_Level1_ID']).set_index(['OEM_Level1_ID'])
    prod_cross['NbrSuppLinks_mean_weighted_by_age'] = prod_cross['SuppLinks_X_Age_mean'] / prod_cross['Sum_AgeLink']
    maskNAICScr = prod_sup_cross['Sup_NAICS'] == True
    prod_cross['SuppLinks_mean_X_EffSpanYears'] = prod_sup_cross[maskNAICScr].groupby('OEM_Level1_ID')['SuppLinks_yearlymean_X_EffSpanYears'].mean()
    prod_cross['SuppLinks_mean_weighted_EffSpanYears'] = prod_sup_cross[maskNAICScr].groupby('OEM_Level1_ID')['SuppLinks_mean_weighted_EffSpanYears'].mean()
    prod_cross['NbrSuppLinks_mean_weighted_by_age_meanoveryears'] = prod_panel.groupby('OEM_Level1_ID')['NbrSuppLinks_mean_weighted_by_age'].mean()
    prod_cross = prod_cross.reset_index()
    return prod_cross



def main():
    OEMs = oems_level1()
    dataSC = get_supplychain_data_of_oems(OEMs)
    dataSC = check_if_need_to_add_fctset_ids_fromaliases(dataSC)
    dataSC = check_if_need_to_add_fctset_ids_fromredirect(dataSC)
    dataSC = create_year_variables(dataSC)
    dataSC = flag_suppliers_with_relevant_NAICS(dataSC)
    dataSC_agg = aggregate_at_OEM_Level1_and_supplier_level(dataSC)
    # Adding info about names and regions
    oem_info = add_oems_name_and_region(dataSC)
    suppliers_info = add_suppliers_name_and_region()
    supplier_year_first_appears = add_year_Supp_appears_in_FctSet(dataSC)
    suppliers_info = suppliers_info.merge(supplier_year_first_appears,on = 'SUPPLIER_ID', how='left')
    names_regions = dataSC_agg[['OEM_Level1_ID', 'SUPPLIER_ID']].merge(oem_info, on='OEM_Level1_ID', how='left').merge(suppliers_info, on='SUPPLIER_ID', how='left')
    dataSC_agg = names_regions.merge(dataSC_agg, on=['OEM_Level1_ID', 'SUPPLIER_ID'], how='left')
    # transform into a long panel format
    prod_sup_panel = create_panel_oem_supplier_year(dataSC_agg)
    prod_sup_panel.to_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/SC_ProdSuppPanel.csv', index=False)
    prod_sup_cross = create_crosssection_oem_supplier(dataSC, dataSC_agg, prod_sup_panel)
    prod_sup_cross.to_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/SC_ProdSuppCross.csv', index=False)
    prod_panel = create_oem_panel(prod_sup_panel)
    prod_panel.to_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/SC_ProdPanel.csv', index=False)
    prod_cross = create_oem_cross(prod_sup_cross, prod_sup_panel, prod_panel)
    prod_cross.to_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/SC_ProdCross.csv', index=False)
